##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.6.2
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
The TCGA MAF summary file
maf_file <- "/media/theron/My_Passport/TCGA_junctions/maf_summary.txt"
mc3_maf = read.table(maf_file,header=T)
mc3_maf$Tumor_Sample_ID <- vapply(TCGAbarcode(mc3_maf$Tumor_Sample_Barcode,sample=T),
function(val){substr(val,1,nchar(val)-1)},
character(1))
rownames(mc3_maf) <- mc3_maf$Tumor_Sample_Barcode
mc3_maf$participant_ID <- TCGAbarcode(mc3_maf$Tumor_Sample_Barcode,participant=T)
mut_sig_perc <- readRDS("/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/mut_sig_percentages.rds")
mut_sig_perc$sample_ID<-vapply(TCGAbarcode(rownames(mut_sig_perc),sample=T),function(sample){
substr(sample,1,nchar(sample)-1)
},character(1))
apobec <- c("T[C>T]A","T[C>T]T","T[C>G]A","T[C>G]T")
junc_rse_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/CHOL/juncrse.rds"
junc_rse <- readRDS(junc_rse_file)
junc_metadata <- as.data.frame(junc_rse@colData@listData)
junc_rse_cols <- colnames(junc_metadata)
tumor_data_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/filenames.txt"
tumor_data <- read.table(tumor_data_file)
cancers <- basename(tumor_data$V1)
# TMB<-list()
cluster_metrics_tum <- data.frame(cancers)
TMB_all <- data.frame(cancers)
TMB_all$av_cor <- NA
TMB_all$av_pval <- NA
TMB_all$med_cor <- NA
TMB_all$med_pval <- NA
TMB_all$max_cor <- NA
TMB_all$max_pval <- NA
rownames(TMB_all) <- cancers
all_genes <- c()
for (i in seq(nrow(tumor_data))){
print(sprintf("%d out of %d",i,nrow(tumor_data)))
tumor_dir <- tumor_data[i,]
cancer <- basename(tumor_dir)
print(cancer)
tumor_meta_file <- sprintf("%s/%s_metadata.txt",tumor_dir,cancer)
tumor_meta <- read.table(tumor_meta_file,quote="",sep="\t")
tumor_meta$participant_ID <- TCGAbarcode(tumor_meta[,4],participant=T)
tumor_meta$nbases<-tumor_meta[,ncol(tumor_meta)-9]
mc3_maf_small<-subset(mc3_maf,participant_ID %in% tumor_meta$participant_ID)
mc3_maf_small <- mc3_maf_small[complete.cases(mc3_maf_small),]
mc3_maf_small$type <- vapply(rownames(mc3_maf_small),function(barcode){
type<-as.numeric(substr(strsplit(barcode,"-")[[1]][4],1,2))
if (type <= 9){
return("T")
} else if (type > 9 & type <= 19){
return ("N")
} else {
return ("C")
}
},character(1))
mc3_maf_small$TMB<-log10(mc3_maf_small$total+1)
mc3_maf_small <- mc3_maf_small %>% dplyr::filter(type == "T")
tumor_geno_file <- sprintf("%s/%s_genotypes.txt",tumor_dir,cancer)
tumor_geno <- read.table(tumor_geno_file,header=T)
tumor_geno <- tumor_geno %>% dplyr::filter(type=="T")
external_ids <- tumor_geno$external_id
splice_mut_file <- sprintf("%s/filenames.txt",tumor_dir,cancer)
splice_mut_files <- read.table(splice_mut_file)
for (j in seq(length(splice_mut_files$V1))){
if (j == 1){
combined_gene_metric <- readRDS(splice_mut_files$V1[j])
} else {
combined_gene_metric <- cbind(combined_gene_metric,readRDS(splice_mut_files$V1[j]))
}
}
combined_gene_metric[combined_gene_metric==-Inf]<-0
combined_gene_metric[is.na(combined_gene_metric)]<-0
combined_gene_metric <- combined_gene_metric[,external_ids]
all_genes <- unique(c(all_genes,rownames(combined_gene_metric)))
colnames(combined_gene_metric) <- vapply(colnames(combined_gene_metric),function(col_name){
col_name<-str_replace(col_name,"X","")
col_name <- str_replace_all(col_name,"[.]","-")
tumor_geno$sample_id[which(tumor_geno$external_id == col_name)[1]]
},character(1))
mc3_maf_small <- mc3_maf_small %>% dplyr::filter(Tumor_Sample_ID %in% colnames(combined_gene_metric))
combined_gene_metric<-combined_gene_metric[,mc3_maf_small$Tumor_Sample_ID]
combined_gene_metric_per_sample<-data.frame(colnames(combined_gene_metric))
combined_gene_metric_per_sample$av <- vapply(seq(ncol(combined_gene_metric)),
function(col_val){mean(as.numeric(combined_gene_metric[,col_val]),na.rm=T)},
numeric(1))
combined_gene_metric_per_sample$med <- vapply(seq(ncol(combined_gene_metric)),
function(col_val){median(as.numeric(combined_gene_metric[,col_val]),na.rm=T)},
numeric(1))
combined_gene_metric_per_sample$max <- vapply(seq(ncol(combined_gene_metric)),
function(col_val){max(as.numeric(combined_gene_metric[,col_val]),na.rm=T)},
numeric(1))
combined_gene_metric_per_sample$sum <- vapply(seq(ncol(combined_gene_metric)),
function(col_val){sum(as.numeric(combined_gene_metric[,col_val]),na.rm=T)},
numeric(1))
combined_gene_metric_per_sample$TMB <- mc3_maf_small$TMB
combined_gene_metric_per_sample$cancer <- cancer
colnames(combined_gene_metric_per_sample) <- c("sample","gene_metric_av","gene_metric_med",
"gene_metric_max","gene_metric_sum","TMB","cancer")
combined_gene_metric_perc <- data.frame(t(vapply(combined_gene_metric_per_sample$sample,function(samp){
a<-which(mut_sig_perc$sample_ID == samp)
if (length(a)==0){
return(rep(-1,ncol(mut_sig_perc)-1))
} else {
return(as.numeric(mut_sig_perc[a,seq(96)]))
}
},numeric(ncol(mut_sig_perc)-1))))
colnames(combined_gene_metric_perc)<-colnames(mut_sig_perc)[seq(96)]
if (i == 1){
combined_gene_metric_per_sample_all <- combined_gene_metric_per_sample
combined_gene_metric_perc_all <- combined_gene_metric_perc
} else {
combined_gene_metric_per_sample_all <- rbind(combined_gene_metric_per_sample_all,combined_gene_metric_per_sample)
combined_gene_metric_perc_all <- rbind( combined_gene_metric_perc_all, combined_gene_metric_perc)
}
# combined_gene_metric_log10<-as.matrix(log10(combined_gene_metric+1))
# colnames(combined_gene_metric_log10)<-colnames(combined_gene_metric)
#
# print(Heatmap(combined_gene_metric_log10,
# top_annotation = HeatmapAnnotation(TMB=anno_barplot(mc3_maf_small$TMB)),
# show_row_names=F,
# show_column_names = F,
# cluster_rows=T,
# cluster_columns=T))
a<-cor.test(combined_gene_metric_per_sample$gene_metric_av,combined_gene_metric_per_sample$TMB,method="pearson")
TMB_all[cancer,"av_cor"]<-a$estimate
TMB_all[cancer,"av_pval"]<-a$p.value
a<-cor.test(combined_gene_metric_per_sample$gene_metric_med,combined_gene_metric_per_sample$TMB,method="pearson")
TMB_all[cancer,"med_cor"]<-a$estimate
TMB_all[cancer,"med_pval"]<-a$p.value
a<-cor.test(combined_gene_metric_per_sample$gene_metric_max,combined_gene_metric_per_sample$TMB,method="pearson")
TMB_all[cancer,"max_cor"]<-a$estimate
TMB_all[cancer,"max_pval"]<-a$p.value
combined_gene_metric_perc_cor_pval <- data.frame(t(vapply(seq(ncol(mut_sig_perc)-1),function(col_val){
ret_val <- c()
mut_col <- as.numeric(combined_gene_metric_perc[,col_val])
aa<-data.frame(combined_gene_metric_per_sample$gene_metric_av,mut_col)
colnames(aa)<-c("gene_metric_av","mut_col")
aa<-aa %>% dplyr::filter(mut_col >= 0)
a<-cor.test(aa$gene_metric_av,aa$mut_col,method="pearson")
ret_val <- c(ret_val,a$estimate,a$p.value)
aa<-data.frame(combined_gene_metric_per_sample$gene_metric_med,mut_col)
colnames(aa)<-c("gene_metric_med","mut_col")
aa<-aa %>% dplyr::filter(mut_col >= 0)
a<-cor.test(aa$gene_metric_med,aa$mut_col,method="pearson")
ret_val <- c(ret_val,a$estimate,a$p.value)
aa<-data.frame(combined_gene_metric_per_sample$gene_metric_max,mut_col)
colnames(aa)<-c("gene_metric_max","mut_col")
aa<-aa %>% dplyr::filter(mut_col >= 0)
a<-cor.test(aa$gene_metric_max,aa$mut_col,method="pearson")
ret_val <- c(ret_val,a$estimate,a$p.value)
},numeric(6))))
colnames(combined_gene_metric_perc_cor_pval)<-c("av_cor","av_pval","med_cor","med_pval","max_cor","max_pval")
rownames(combined_gene_metric_perc_cor_pval) <- colnames(combined_gene_metric_perc)
print(ggplot(combined_gene_metric_per_sample,aes(x=log10(gene_metric_av+1),y=TMB))+
geom_point()+
stat_cor(method = "pearson")+
geom_smooth(method="lm")+
labs(title=sprintf(cancer)))
print(ggplot(combined_gene_metric_per_sample,aes(x=log10(gene_metric_med+1),y=TMB))+
geom_point()+
stat_cor(method = "pearson")+
geom_smooth(method="lm")+
labs(title=sprintf(cancer)))
print(ggplot(combined_gene_metric_per_sample,aes(x=log10(gene_metric_max+1),y=TMB))+
geom_point()+
stat_cor(method = "pearson")+
geom_smooth(method="lm")+
labs(title=sprintf(cancer)))
combined_gene_metric_perc_cor_pval <- combined_gene_metric_perc_cor_pval %>% dplyr::filter(av_pval <= 0.05 | med_pval <= 0.05 | max_pval <= 0.05)
rownames(combined_gene_metric_perc_cor_pval) <- vapply(rownames(combined_gene_metric_perc_cor_pval),function(rname){
if (rname %in% apobec){
return(sprintf("%s:abobec",rname))
} else {
return(rname)
}
},character(1))
combined_gene_metric_perc_cor <- combined_gene_metric_perc_cor_pval[,c("av_cor","med_cor","max_cor")]
combined_gene_metric_perc_pval <- combined_gene_metric_perc_cor_pval[,c("av_pval","med_pval","max_pval")]
if (nrow(combined_gene_metric_perc_cor_pval) > 0){
print(Heatmap(as.matrix(combined_gene_metric_perc_cor), cell_fun = function(j, i, x, y, w, h, fill) {
if(combined_gene_metric_perc_pval[i, j] < 0.005) {
grid.text("**", x, y)
} else if(combined_gene_metric_perc_pval[i, j] < 0.05) {
grid.text("*", x, y)
}
},name=sprintf("%s: splicemut vs mutation type",cancer),cluster_columns=F))
} else {
print(sprintf("%s: no significant mutation trends",cancer))
}
}
## [1] "1 out of 14"
## [1] "BLCA"
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## [1] "2 out of 14"
## [1] "BRCA"
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## number of items read is not a multiple of the number of columns
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## [1] "3 out of 14"
## [1] "CHOL"
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## [1] "4 out of 14"
## [1] "COAD"
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## [1] "5 out of 14"
## [1] "HNSC"
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## [1] "6 out of 14"
## [1] "KICH"
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## [1] "7 out of 14"
## [1] "KIRP"
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## [1] "8 out of 14"
## [1] "LIHC"
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## [1] "9 out of 14"
## [1] "LUAD"
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## [1] "10 out of 14"
## [1] "LUSC"
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## [1] "11 out of 14"
## [1] "PRAD"
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## [1] "12 out of 14"
## [1] "READ"
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## [1] "13 out of 14"
## [1] "THCA"
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## [1] "14 out of 14"
## [1] "UCEC"
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
datatable(TMB_all, caption = "TMB")
# ggplot(combined_gene_metric_per_sample_all,aes(x=gene_metric_av,y=TMB))+
# geom_point()+
# stat_cor(method = "pearson",
# label.x.npc = "center",
# label.y.npc = "top")+
# geom_smooth(method="lm")+
# labs("All Samples")
#
# ggplot(combined_gene_metric_per_sample_all,aes(x=gene_metric_med,y=TMB))+
# geom_point()+
# stat_cor(method = "pearson",
# label.x.npc = "center",
# label.y.npc = "top")+
# geom_smooth(method="lm")+
# labs("All Samples")
ggplot(combined_gene_metric_per_sample_all,aes(x=log10(gene_metric_med+1),y=TMB))+
geom_point()+
stat_cor(method = "pearson")+
geom_smooth(method="lm")+
labs("All Samples")
## `geom_smooth()` using formula 'y ~ x'
ggplot(combined_gene_metric_per_sample_all,aes(x=log10(gene_metric_av+1),y=TMB))+
geom_point()+
stat_cor(method = "pearson")+
geom_smooth(method="lm")+
labs("All Samples")
## `geom_smooth()` using formula 'y ~ x'
ggplot(combined_gene_metric_per_sample_all,aes(x=log10(gene_metric_max+1),y=TMB))+
geom_point()+
stat_cor(method = "pearson")+
geom_smooth(method="lm")+
labs("All Samples")
## `geom_smooth()` using formula 'y ~ x'